Every choice we make along our RNA-seq analysis pipeline will affect the results we get and the conclusions we make about the data. Here, we document and explain the choices we make for each critical step in our PDX analysis:
# Load packages
source(here::here('packages.R'))

# Read in the PDX Seurat object and put treatment conditions in
# chronological order (vehicle -> MRD -> relapse)
PDX <- readRDS("data/Izar_2020/Izar_2020_PDX.RDS")
treatment_levels <- c("vehicle", "MRD", "relapse")
PDX$treatment.status <- factor(PDX$treatment.status, levels = treatment_levels)

# Read in cell-cycle gene lists (Regev lab file): the first 43 entries are
# S-phase genes, the remaining 54 are G2/M genes
ccgenes <- read_lines("gene_lists/regev_lab_cell_cycle_genes.txt")
s.genes <- ccgenes[1:43]
g2m.genes <- ccgenes[44:97]

# Read in the hallmark gene sets of interest, one file per hallmark
# (skip = 1 drops each file's header line)
hallmark_names <- read_lines("gene_lists/hallmarks.txt")
hallmark.list <- vector(mode = "list", length = length(hallmark_names))
names(hallmark.list) <- hallmark_names
for (hm in hallmark_names) {
  # named `genes` rather than `file` to avoid shadowing base::file()
  genes <- read_lines(glue("hallmarks/{hm}_updated.txt"), skip = 1)
  hallmark.list[[hm]] <- genes
}

# Create the output directory for this analysis if it does not exist yet
# (recursive = TRUE also creates any missing parent directories)
if (!dir.exists("jesslyn_plots/PDX_test")) {
  dir.create("jesslyn_plots/PDX_test", recursive = TRUE)
}
IMPORTANCE: scaling makes count data more comparable across cells and across genes, and how we scale our data affects downstream analyses such as dimensionality reduction (PCA).
# Scale the PDX data in the four possible ways (TRUE/FALSE spelled out:
# T and F are ordinary, reassignable variables in R and should not be
# used as logical literals)
PDX_orig <- ScaleData(PDX, do.scale = FALSE, do.center = FALSE)
PDX_center <- ScaleData(PDX, do.scale = FALSE, do.center = TRUE)
PDX_scale <- ScaleData(PDX, do.scale = TRUE, do.center = FALSE)
PDX_scale_center <- ScaleData(PDX, do.scale = TRUE, do.center = TRUE)
#COMPARE EACH SCENARIO THROUGH VISUALIZATION
# Helper: violin plot of one per-cell / per-gene summary statistic across
# the four scaling scenarios (stat.df has one column per scenario)
plot_scale_violin <- function(stat.df, ylab, title) {
  plot.df <- reshape2::melt(stat.df)
  ggplot(plot.df, aes(x = variable, y = value, fill = variable)) +
    geom_violin() +
    labs(x = "PDX data norm type", y = ylab, title = title) +
    theme_bw()
}

# Mean expression per cell in each scenario
df <- data.frame(
  "orig" = colMeans(PDX_orig[["RNA"]]@scale.data),
  "center" = colMeans(PDX_center[["RNA"]]@scale.data),
  "scale" = colMeans(PDX_scale[["RNA"]]@scale.data),
  "scale-center" = colMeans(PDX_scale_center[["RNA"]]@scale.data)
)
p1 <- plot_scale_violin(df, "mean expression/cell", "expression/cell (Plot #1)")

# Mean expression per gene in each scenario (centering should pull
# per-gene means to zero)
df <- data.frame(
  "orig" = rowMeans(PDX_orig[["RNA"]]@scale.data),
  "center" = rowMeans(PDX_center[["RNA"]]@scale.data),
  "scale" = rowMeans(PDX_scale[["RNA"]]@scale.data),
  "scale-center" = rowMeans(PDX_scale_center[["RNA"]]@scale.data)
)
p2 <- plot_scale_violin(df, "mean expression/gene", "evaluate centering (Plot #2)") +
  ylim(-5, 10)

# Standard deviation across cells per gene in each scenario (scaling
# should pull per-gene SDs to one)
sd.df <- data.frame(
  "orig" = apply(PDX_orig[["RNA"]]@scale.data, 1, sd),
  "center" = apply(PDX_center[["RNA"]]@scale.data, 1, sd),
  "scale" = apply(PDX_scale[["RNA"]]@scale.data, 1, sd),
  "scale-center" = apply(PDX_scale_center[["RNA"]]@scale.data, 1, sd)
)
p3 <- plot_scale_violin(sd.df, "SD/gene", "evaluate scaling (Plot #3)")

p1 + p2 + p3 + patchwork::plot_layout(nrow = 1)
ggsave(filename = "PDX_data_scaletest.png", path = "jesslyn_plots/PDX_test", width = 15, height = 5)
We previously decided the parameters for ScaleData (do.scale = TRUE, do.center = TRUE). Now, we’d like to see the effect of scaling before subsetting vs. scaling each model individually after subsetting by model, to decide which scenario is best.
QUESTION: Should we scale our data before subsetting by model, or should we subset by model first?
IMPORTANCE: We are interested in comparing gene expression between treatment conditions within each model. Considering that the difference between models are so drastic, using count data that is scaled so that it is comparable across models would mask over the smaller differences between treatment conditions within a specific model. It is therefore a significant step to make sure that our count data in each model is scaled so that it is comparable across all cells within a specific model, instead of across models.
HYPOTHESIS: We hypothesize that we should subset by model first, before scaling the data in each model separately, so that the data for each model would be scaled across all cells within the specific model itself.
# Scenario 1: scale first then subset (PDX -> scale -> subset)
scale_center_DF20 <- subset(PDX_scale_center, subset = (model_ID == "DF20"))
# Scenario 2: subset first, then scale (PDX -> subset -> scale), so the
# scaling statistics come from DF20 cells only
DF20 <- subset(PDX, subset = model_ID == "DF20")
DF20_scale_center <- ScaleData(DF20, do.scale = TRUE, do.center = TRUE)
# Scenario 3: scale first, subset, and scale again (PDX -> scale -> subset -> scale)
scale_center_DF20_scale_center <- ScaleData(scale_center_DF20, do.scale = TRUE, do.center = TRUE)
#COMPARE EACH SCENARIO THROUGH VISUALIZATION
# Helper: violin plot of one summary statistic across the three
# scale-vs-subset orderings (stat.df has one column per scenario)
plot_subset_violin <- function(stat.df, ylab, title) {
  plot.df <- reshape2::melt(stat.df)
  ggplot(plot.df, aes(x = variable, y = value, fill = variable)) +
    geom_violin() +
    labs(x = "PDX scale vs. subset sequence", y = ylab, title = title) +
    theme_bw()
}

# Mean expression per cell within model DF20
cell.mean.df <- data.frame(
  "scale-center-DF20" = colMeans(scale_center_DF20[["RNA"]]@scale.data),
  "DF20-scale-center" = colMeans(DF20_scale_center[["RNA"]]@scale.data),
  "before-DF20-after" = colMeans(scale_center_DF20_scale_center[["RNA"]]@scale.data)
)
p1 <- plot_subset_violin(cell.mean.df, "mean expression/cell", "expression/cell")

# Mean expression per gene across all cells within model DF20
gene.mean.df <- data.frame(
  "scale-center-DF20" = rowMeans(scale_center_DF20[["RNA"]]@scale.data),
  "DF20-scale-center" = rowMeans(DF20_scale_center[["RNA"]]@scale.data),
  "before-DF20-after" = rowMeans(scale_center_DF20_scale_center[["RNA"]]@scale.data)
)
p2 <- plot_subset_violin(gene.mean.df, "mean expression/gene", "evaluate the mean expression per gene")

# Standard deviation per gene across all cells within model DF20.
# apply(..., 1, sd) is a standard deviation, so the labels say "SD"
# (the original axis/title said "variance")
sd.df <- data.frame(
  "scale-center-DF20" = apply(scale_center_DF20[["RNA"]]@scale.data, 1, sd),
  "DF20-scale-center" = apply(DF20_scale_center[["RNA"]]@scale.data, 1, sd),
  "before-DF20-after" = apply(scale_center_DF20_scale_center[["RNA"]]@scale.data, 1, sd)
)
p3 <- plot_subset_violin(sd.df, "SD/gene", "evaluate the SD in expression per gene")

p1 + p2 + p3 + patchwork::plot_layout(nrow = 1)
ggsave(filename = "PDX_data_scaleVSsubset.png", path = "jesslyn_plots/PDX_test", width = 15, height = 5)
before.DF20.after = scale and center across all cells, subset by model, scale and center within each model again
QUESTION: Which data slot does AddModuleScore use?
IMPORTANCE: The type of data we use to score our cells will drastically affect downstream DE analysis.
HYPOTHESIS: we hypothesize that AddModuleScore uses the normalized but unscaled, uncentered “data” slot.
# Calculate module scores on the unscaled, uncentered PDX data, and plot the
# distribution of OXPHOS scores. The "25" suffix is the index AddModuleScore
# appends to the 25th name in hallmark.list.
PDX_orig <- AddModuleScore(PDX_orig, features = hallmark.list, name = names(hallmark.list), nbin = 25, search = TRUE)
p1 <- VlnPlot(PDX_orig, features = "HALLMARK_OXIDATIVE_PHOSPHORYLATION25") + labs(title = "oxphos orig.score distribution", x = "PDX_orig")
# Calculate module scores on the scaled and centered PDX data, and plot the
# distribution of OXPHOS scores for comparison
PDX_scale_center <- AddModuleScore(PDX_scale_center, features = hallmark.list, name = names(hallmark.list), nbin = 25, search = TRUE)
p2 <- VlnPlot(PDX_scale_center, features = "HALLMARK_OXIDATIVE_PHOSPHORYLATION25") + labs(title = "oxphos scale.center.score distribution", x = "PDX_scale_center")
p1 + p2 + plot_layout(guides= 'collect', nrow = 1, ncol = 2)
ggsave(filename = "PDX_data_addScoreTest.png", path = "jesslyn_plots/PDX_test", width = 10, height = 5)
Our current workflow: PDX -> subset -> scale and center separately
IMPORTANCE: Since we are interested in detecting differential expression across treatment conditions within each model, it is important to investigate whether there are significant differences between scoring cells before vs. after subsetting by model, and to determine which workflow would be better.
#Scenario 1: PDX -> scale -> add module score -> subset
# Scores were computed on the full (all-model) object in the previous chunk;
# here we only subset the already-scored cells down to model DF20.
score_DF20 <- subset(PDX_scale_center, subset = (model_ID == "DF20"))
p3 <- VlnPlot(score_DF20, features = "HALLMARK_OXIDATIVE_PHOSPHORYLATION25") + labs(title = "OXPHOS score first subset later (DF20)") + theme(plot.title = element_text(size = 8))
#Scenario 2: PDX -> subset -> scale -> add module score
# DF20_scale_center was subset to DF20 and re-scaled earlier, so scoring here
# uses expression bins computed from DF20 cells only.
DF20_score <- AddModuleScore(DF20_scale_center, features = hallmark.list, name = names(hallmark.list), nbin = 25, search = T)
p4 <- VlnPlot(DF20_score, features = "HALLMARK_OXIDATIVE_PHOSPHORYLATION25") + labs(title = "OXPHOS subset first score individually (DF20)") + theme(plot.title = element_text(size = 8))
#compare distribution of OXPHOS score between the two scenarios
p3 + p4 + plot_layout(guides= 'collect', nrow = 1, ncol = 2)
ggsave(filename = "PDX_data_scoreVSsubset.png", path = "jesslyn_plots/PDX_test", width = 10, height = 5)
We wonder if it would be better to “force” AddModuleScore in using the “scale.data” slot (scaled and centered by model) instead, since the counts in the “scale.data” slot is relative to all cells within DF20, instead of across all models.
IMPORTANCE: The type of data we use to score our cells may drastically affect downstream DE analysis.
HYPOTHESIS: We hypothesize that using the “scale.data” slot would be best, since it is scaled across all cells within a specific model.
# Feature names as stored by AddModuleScore (hallmark name plus its index
# within hallmark.list)
hms <- c("HALLMARK_OXIDATIVE_PHOSPHORYLATION25", "HALLMARK_UNFOLDED_PROTEIN_RESPONSE33", "HALLMARK_P53_PATHWAY26")
#Scenario 1: Using the "data" slot (basically scenario 2 from above, DF20_score object)
p1 <- VlnPlot(DF20_score, features = hms, combine = FALSE)
p1[[1]] <- p1[[1]] + labs(title = "OXPHOS ('data')", x = "DF20")
p1[[2]] <- p1[[2]] + labs(title = "UPR ('data')", x = "DF20")
p1[[3]] <- p1[[3]] + labs(title = "p53 ('data')", x = "DF20")
#Scenario 2: Using the "scale.data" slot
# "Force" AddModuleScore to use the scaled values by copying scale.data into
# the "data" slot before scoring
scale.data <- GetAssayData(object = DF20_scale_center, slot = "scale.data")
DF20_scale_center_forced <- SetAssayData(object = DF20_scale_center, slot = "data", new.data = scale.data, assay = "RNA")
DF20_scale.dataSlot <- AddModuleScore(DF20_scale_center_forced, features = hallmark.list, name = names(hallmark.list), nbin = 25, search = TRUE)
p2 <- VlnPlot(DF20_scale.dataSlot, features = hms, combine = FALSE)
p2[[1]] <- p2[[1]] + labs(title = "OXPHOS ('scale.data')", x = "DF20")
p2[[2]] <- p2[[2]] + labs(title = "UPR ('scale.data')", x = "DF20")
p2[[3]] <- p2[[3]] + labs(title = "p53 ('scale.data')", x = "DF20")
#Scenario 3: Using the "data" slot, center the scores afterward, reassign to metadata
# Select the score columns by name (every AddModuleScore output starts with
# "HALLMARK_") rather than by the fragile hard-coded positions 9:42, which
# silently break if the metadata layout changes
hm.names <- grep("^HALLMARK_", names(DF20_score@meta.data), value = TRUE)
hms.centered <- c("HALLMARK_OXIDATIVE_PHOSPHORYLATION25.centered", "HALLMARK_UNFOLDED_PROTEIN_RESPONSE33.centered", "HALLMARK_P53_PATHWAY26.centered")
for(i in hm.names){
  # scale(x, scale = FALSE) subtracts the column mean only (no SD division)
  hm.centered <- scale(DF20_score[[i]], scale = FALSE)
  DF20_score <- AddMetaData(DF20_score, hm.centered, col.name = glue("{i}.centered"))
}
p3 <- VlnPlot(DF20_score, features = hms.centered, combine = FALSE)
p3[[1]] <- p3[[1]] + labs(title = "OXPHOS ('data' score centered)", x = "DF20")
p3[[2]] <- p3[[2]] + labs(title = "UPR ('data' score centered)", x = "DF20")
p3[[3]] <- p3[[3]] + labs(title = "p53 ('data' score centered)", x = "DF20")
#COMPARE: one hallmark per figure, three scenarios side by side
p1[[1]] + p3[[1]] + p2[[1]] + plot_layout(guides= 'collect', nrow = 1, ncol = 3)
ggsave(filename = "DF20_AddModuleScore_oxphos.png", path = "jesslyn_plots/PDX_test", width = 10, height = 5)
p1[[2]] + p3[[2]] + p2[[2]] + plot_layout(guides= 'collect', nrow = 1, ncol = 3)
ggsave(filename = "DF20_AddModuleScore_UPR.png", path = "jesslyn_plots/PDX_test", width = 10, height = 5)
p1[[3]] + p3[[3]] + p2[[3]] + plot_layout(guides= 'collect', nrow = 1, ncol = 3)
ggsave(filename = "DF20_AddModuleScore_p53.png", path = "jesslyn_plots/PDX_test", width = 10, height = 5)
To look into this more, we compare the variance in module score between the scenarios.
# Compute the standard deviation of each module score under each scenario
DF20_data.df <- DF20_score@meta.data %>% as.data.frame()
DF20_scale.data.df <- DF20_scale.dataSlot@meta.data %>% as.data.frame()
# Select columns by name instead of hard-coded positions (9:42 / 43:76):
# raw scores end in the numeric index AddModuleScore appends, centered
# copies end in ".centered"
score.cols <- grep("^HALLMARK_.*[0-9]$", names(DF20_data.df), value = TRUE)
centered.cols <- grep("\\.centered$", names(DF20_data.df), value = TRUE)
sd.df <- data.frame(
  "DF20_data" = apply(DF20_data.df[score.cols], 2, sd),
  "DF20_data.centered" = apply(DF20_data.df[centered.cols], 2, sd),
  "DF20_scale.data" = apply(DF20_scale.data.df[score.cols], 2, sd)
)
plot.df <- reshape2::melt(sd.df)
# apply(..., 2, sd) is a standard deviation, so the labels say "SD" (the
# original said "variance"); also fixes the "senario" typo in the x label
ggplot(plot.df, aes(x = variable, y = value, fill = variable)) + geom_violin() +
  labs(x = "DF20 AddModuleScore scenario", y = "SD/module", title = "Comparing DF20 SD in module score") +
  theme_bw()
ggsave(filename = "DF20_AddModuleScore_variance.png", path = "jesslyn_plots/PDX_test", width = 10, height = 5)
To further investigate this issue, we now split the violin plot (still focusing solely on DF20) by treatment status to examine if the trends we see are similar between using the data vs. scale.data slot. * We hypothesize that we should see clear trends where the MRD treatment condition differentially overexpress OXPHOS, UPR, and p53 genes, especially when using the scale.data slot.
# Scenario 1: scores computed on the "data" slot, split by treatment status
p1 <- VlnPlot(DF20_score, features = hms, combine = FALSE, group.by = "treatment.status")
data.titles <- c("OXPHOS ('data')", "UPR ('data')", "p53 ('data')")
for (k in seq_along(data.titles)) {
  p1[[k]] <- p1[[k]] + labs(title = data.titles[k], x = "DF20")
}
# Scenario 2: scores computed on the "scale.data" slot
p2 <- VlnPlot(DF20_scale.dataSlot, features = hms, combine = FALSE, group.by = "treatment.status")
scale.titles <- c("OXPHOS ('scale.data')", "UPR ('scale.data')", "p53 ('scale.data')")
for (k in seq_along(scale.titles)) {
  p2[[k]] <- p2[[k]] + labs(title = scale.titles[k], x = "DF20")
}
# Scenario 3: "data"-slot scores, centered after the fact
p3 <- VlnPlot(DF20_score, features = hms.centered, combine = FALSE, group.by = "treatment.status")
centered.titles <- c("OXPHOS ('data' score centered)", "UPR ('data' score centered)", "p53 ('data' score centered)")
for (k in seq_along(centered.titles)) {
  p3[[k]] <- p3[[k]] + labs(title = centered.titles[k], x = "DF20")
}
# COMPARE: one hallmark per figure, the three scenarios side by side
p1[[1]] + p3[[1]] + p2[[1]] + plot_layout(guides= 'collect', nrow = 1, ncol = 3)
ggsave(filename = "DF20_AddModuleScore_oxphosByT.png", path = "jesslyn_plots/PDX_test", width = 10, height = 5)
p1[[2]] + p3[[2]] + p2[[2]] + plot_layout(guides= 'collect', nrow = 1, ncol = 3)
ggsave(filename = "DF20_AddModuleScore_UPRByT.png", path = "jesslyn_plots/PDX_test", width = 10, height = 5)
p1[[3]] + p3[[3]] + p2[[3]] + plot_layout(guides= 'collect', nrow = 1, ncol = 3)
ggsave(filename = "DF20_AddModuleScore_p53ByT.png", path = "jesslyn_plots/PDX_test", width = 10, height = 5)
Although similar trends are seen between the three scenarios, there are also some inconsistencies between them. For example: * It seems like the trends are more obvious when calculating module score with the scale.data slot. - This is what we hoped/expected because the scale.data slot scales the counts across cells within the same model, rather than across all models (which would have masked intra-model differences due to the large inter-model differences) * For the UPR plot, it seems like MRD has the highest expression level when using the “data” slot, but not when using the “scale.data” slot. This is strange because scaling the data shouldn’t affect the relative position of the count data. For example, if a cell in treatment condition A expresses a higher level of gene A than condition B, scaling the data and controlling for the standard deviation should still make condition A express higher levels of gene A than condition B. * The results from this graph therefore tell us that there are significant differences depending on the slot we use for AddModuleScore, considering how the two slots resulted in different trends.
QUESTION: Which data slot is best for FindMarkers?
IMPORTANCE: Possible that we will get different DE genes, LogFC, and pvalues, depending on the data slot we use. This may drastically affect downstream DE Analysis such as GSEA and Volcano Plots, which both rely on the FindMarkers function.
HYPOTHESIS: We hypothesize that it would be better to use the “data” slot because the “scale.data” slot are already z-scores.
# DE between MRD and vehicle in DF20, once per slot. The "data" slot reports
# effect sizes as avg_logFC; the "scale.data" slot reports avg_diff (mean
# difference of z-scores), so the two tables carry different effect columns.
data_slot <- FindMarkers(DF20_score, group.by = "treatment.status", ident.1 = "MRD", ident.2 = "vehicle", logfc.threshold = 0) %>% rownames_to_column %>% arrange(p_val_adj, -avg_logFC) %>% select(rowname, avg_logFC, p_val_adj)
scale.data.slot <- FindMarkers(DF20_score, group.by = "treatment.status", slot = "scale.data", ident.1 = "MRD", ident.2 = "vehicle", logfc.threshold = 0) %>% rownames_to_column %>% arrange(p_val_adj, -avg_diff) %>% select(rowname, avg_diff, p_val_adj)
head(data_slot,10)
## rowname avg_logFC p_val_adj
## 1 DDRGK1 -1.0153358 1.259436e-08
## 2 RPS27 0.6464745 2.223430e-07
## 3 JAKMIP2 -0.1101530 1.312248e-06
## 4 DCUN1D2 0.1787186 1.318863e-06
## 5 RBM3 1.1735248 1.425152e-06
## 6 P4HA2 -1.3037053 3.778287e-06
## 7 RPL30 0.5399016 5.379199e-06
## 8 BMP3 -1.3228755 8.283360e-06
## 9 SPDYE5 -0.6012912 1.189196e-05
## 10 CCNL2 -0.6590741 1.823683e-05
head(scale.data.slot, 10)
## rowname avg_diff p_val_adj
## 1 DDRGK1 -1.48141112 1.259436e-08
## 2 RPS27 0.89255992 2.223430e-07
## 3 JAKMIP2 -0.08510235 1.312248e-06
## 4 DCUN1D2 -0.44624475 1.318863e-06
## 5 RBM3 0.97820541 1.425152e-06
## 6 P4HA2 -1.05384483 3.778287e-06
## 7 RPL30 0.57000373 5.379199e-06
## 8 BMP3 -0.22211938 8.283360e-06
## 9 SPDYE5 -0.72577490 1.189196e-05
## 10 CCNL2 -1.20705581 1.823683e-05
identical(data_slot$rowname, scale.data.slot$rowname)
## [1] FALSE
# Fraction of rank positions at which the two orderings disagree; use the
# actual number of genes tested instead of the hard-coded 13893
length(which(data_slot$rowname != scale.data.slot$rowname)) / nrow(data_slot)
## [1] 0.9512704
# Compare padj distributions between the two slots (own data frame rather
# than reusing `fc.diff`; y label fixed -- these are padj values, not logFC)
padj.df <- data.frame(
  "avg_logFC" = data_slot$p_val_adj,
  "avg_diff" = scale.data.slot$p_val_adj
)
plot.df <- reshape2::melt(padj.df)
ggplot(plot.df, aes(x = variable, y = value, fill = variable)) + geom_violin() +
  labs(x = "'data' vs. 'scale.data'", y = "padj", title = "Comparing the distribution of padj values") +
  theme_bw()
# Compare effect-size distributions (avg_logFC vs. avg_diff)
fc.diff <- data.frame(
  "avg_logFC" = data_slot$avg_logFC,
  "avg_diff" = scale.data.slot$avg_diff
)
plot.df <- reshape2::melt(fc.diff)
ggplot(plot.df, aes(x = variable, y = value, fill = variable)) + geom_violin() +
  labs(x = "'data' vs. 'scale.data'", y = "logFC or diff", title = "Comparing the distribution of avg_logFC vs. avg_diff") +
  theme_bw()
IMPORTANCE: Different statistical tests use different approximations and assumptions, which may slightly affect the resulting DE genes they detect.
HYPOTHESIS: We hypothesize that the wilcox rank sum test would be the best test to use.
# Run the same MRD vs. vehicle comparison under four statistical tests.
# The t-test result is named `t_test` (not `t.test`) so it does not mask
# the base stats::t.test() function.
wilcox <- FindMarkers(DF20_score, group.by = "treatment.status", ident.1 = "MRD", ident.2 = "vehicle", logfc.threshold = 0, test.use = "wilcox") %>% rownames_to_column %>% arrange(p_val_adj, -avg_logFC) %>% select(rowname, avg_logFC, p_val_adj)
t_test <- FindMarkers(DF20_score, group.by = "treatment.status", ident.1 = "MRD", ident.2 = "vehicle", logfc.threshold = 0, test.use = "t") %>% rownames_to_column %>% arrange(p_val_adj, -avg_logFC) %>% select(rowname, avg_logFC, p_val_adj)
MAST <- FindMarkers(DF20_score, group.by = "treatment.status", ident.1 = "MRD", ident.2 = "vehicle", logfc.threshold = 0, test.use = "MAST") %>% rownames_to_column %>% arrange(p_val_adj, -avg_logFC) %>% select(rowname, avg_logFC, p_val_adj)
LR <- FindMarkers(DF20_score, group.by = "treatment.status", ident.1 = "MRD", ident.2 = "vehicle", logfc.threshold = 0, test.use = "LR") %>% rownames_to_column %>% arrange(p_val_adj, -avg_logFC) %>% select(rowname, avg_logFC, p_val_adj)
head(wilcox)
## rowname avg_logFC p_val_adj
## 1 DDRGK1 -1.0153358 1.259436e-08
## 2 RPS27 0.6464745 2.223430e-07
## 3 JAKMIP2 -0.1101530 1.312248e-06
## 4 DCUN1D2 0.1787186 1.318863e-06
## 5 RBM3 1.1735248 1.425152e-06
## 6 P4HA2 -1.3037053 3.778287e-06
head(t_test)
## rowname avg_logFC p_val_adj
## 1 DDRGK1 -1.0153358 2.489026e-08
## 2 RPS27 0.6464745 3.391686e-07
## 3 MIR4461 -0.7236460 7.459450e-06
## 4 CCNL2 -0.6590741 9.600498e-06
## 5 RBM3 1.1735248 1.807201e-05
## 6 TSPAN3 -0.9573806 2.706241e-05
head(MAST)
## rowname avg_logFC p_val_adj
## 1 DDRGK1 -1.0153358 1.760788e-07
## 2 BMP3 -1.3228755 3.613189e-07
## 3 S100A4 0.9265509 1.229044e-06
## 4 RPS27 0.6464745 1.291811e-06
## 5 RBM3 1.1735248 5.633820e-06
## 6 SDF4 -1.2864155 2.034330e-05
head(LR)
## rowname avg_logFC p_val_adj
## 1 DDRGK1 -1.0153358 3.803020e-08
## 2 RPS27 0.6464745 1.514942e-07
## 3 MIR4461 -0.7236460 3.146812e-06
## 4 CCNL2 -0.6590741 1.393802e-05
## 5 P4HA2 -1.3037053 2.393851e-05
## 6 RBM3 1.1735248 2.775418e-05
# TRUE if ANY pair of rankings is identical (|| short-circuits); the FALSE
# below therefore means no two tests produced the same gene ordering
identical(wilcox$rowname, t_test$rowname) || identical(wilcox$rowname, MAST$rowname) || identical(wilcox$rowname, LR$rowname) || identical(t_test$rowname, MAST$rowname) || identical(t_test$rowname, LR$rowname) || identical(MAST$rowname, LR$rowname)
## [1] FALSE
# Pairwise fraction of rank positions that disagree; use the actual number
# of genes tested instead of the hard-coded 13893
n.genes <- nrow(wilcox)
which.diff <- data.frame(
  "wilcox.vs.t" = length(which(wilcox$rowname != t_test$rowname)) / n.genes,
  "wilcox.vs.MAST" = length(which(wilcox$rowname != MAST$rowname)) / n.genes,
  "wilcox.vs.LR" = length(which(wilcox$rowname != LR$rowname)) / n.genes,
  "t.vs.MAST" = length(which(t_test$rowname != MAST$rowname)) / n.genes,
  "t.vs.LR" = length(which(t_test$rowname != LR$rowname)) / n.genes,
  "MAST.vs.LR" = length(which(MAST$rowname != LR$rowname)) / n.genes
)
rownames(which.diff) <- "% Different"
which.diff
## wilcox.vs.t wilcox.vs.MAST wilcox.vs.LR t.vs.MAST t.vs.LR MAST.vs.LR
## % Different 0.9993522 0.9947456 0.9994242 0.9994961 0.9978406 0.9961851
IMPORTANCE: We’ve seen earlier that the rankings and padj values are nearly identical at the top of the list, but start to diverge towards the end of the ranked list. We now investigate how the slot we use affects GSEA results.
HYPOTHESIS: Since GSEA simply walks down the ranked genes and adds to a running sum of enrichment score when a gene in the list corresponds to the gene in the geneset, we hypothesize that we will get drastically different GSEA results considering how the ranked lists diverge towards the end.
# GSEA for DF20 MRD vs. vehicle, run once per expression slot.
# NOTE(review): GSEA_code is a project-local helper not visible in this file.
# From its usage here, calling it with ranks = NULL appears to return a
# ranked gene list, and calling it with ranks supplied appears to return an
# fgsea results table -- confirm against its definition.
slots <- c("data", "scale.data")
DF20.results <- vector("list", length(slots))
names(DF20.results) <- slots
genesetsOI <- c("HALLMARK_OXIDATIVE_PHOSPHORYLATION", "HALLMARK_UNFOLDED_PROTEIN_RESPONSE")
# 2 slots x 2 genesets of interest = 4 enrichment plots
plots <- vector("list", length = 4)
i <- 1
for (st in slots){
# Rank genes using this slot, then run GSEA on the resulting ranks
DF20.ranks <- GSEA_code(DF20_score, slot = st, group.by = "treatment.status", fgseaGS = hallmark.list, group.1 = "MRD", group.2 = "vehicle", ranks = NULL)
DF20.results[[st]] <- GSEA_code(DF20_score, slot = st, group.by = "treatment.status", fgseaGS= hallmark.list, ranks = DF20.ranks)
#graph fgsea results for MRD vs. vehicle (oxphos and UPR)
for (geneset in genesetsOI){
# Pull this geneset's padj and NES to annotate the plot title
padj = DF20.results[[st]] %>% filter(pathway == geneset) %>% select(padj) %>% deframe() %>% round(digits = 3)
NES = DF20.results[[st]] %>% filter(pathway == geneset) %>% select(NES) %>% deframe() %>% round(digits = 3)
plots[[i]] <- plotEnrichment(hallmark.list[[geneset]], DF20.ranks) + labs(title = glue("{geneset} GSEA Results For DF20 MRD vs. vehicle ({st}: NES = {NES}, padj = {padj})")) + theme(plot.title = element_text(size = 8))
ggsave(plot = plots[[i]], filename = glue("DF20_{st}_MRDVSvehicle_{geneset}_GSEA.png"), path = "jesslyn_plots/PDX_test/GSEA_paired/test")
i = i + 1
}
}
plots[[1]] + plots[[2]] + plots[[3]] + plots[[4]] + plot_layout(nrow = 2, ncol = 2)
QUESTION: How does the statistical test we use when calling FindMarkers affect GSEA results?
IMPORTANCE: Since we saw that each statistical test yielded different rankings, we would like to investigate how this affects our GSEA results.
HYPOTHESIS: Since GSEA computes the enrichment score of a geneset by walking down a ranked list and adds to the running sum when a gene in the ranked list corresponds to a gene in the geneset, we hypothesize that GSEA results and plots will look drastically different between different statistical tests considering that different statistical test used for FindMarkers yielded different ranks.
# GSEA for DF20 MRD vs. vehicle, run once per FindMarkers statistical test.
tests <- c("wilcox", "t", "MAST", "LR")
DF20.results.tests <- vector("list", length(tests))
names(DF20.results.tests) <- tests
# 4 tests x 2 genesets of interest = 8 enrichment plots
plots <- vector("list", length = 8)
i <- 1
for (test in tests){
# Rank genes using this statistical test
DF20.ranks <- GSEA_code(DF20_score, stattest = test, group.by = "treatment.status", fgseaGS = hallmark.list, group.1 = "MRD", group.2 = "vehicle", ranks = NULL)
# NOTE(review): unlike the ranking call above, this call omits stattest --
# presumably irrelevant once ranks are supplied, but confirm against
# GSEA_code's definition.
DF20.results.tests[[test]] <- GSEA_code(DF20_score, group.by = "treatment.status", fgseaGS= hallmark.list, ranks = DF20.ranks)
#graph fgsea results for MRD vs. vehicle (oxphos and UTR)
for (geneset in genesetsOI){
# Pull this geneset's padj and NES to annotate the plot title
padj = DF20.results.tests[[test]] %>% filter(pathway == geneset) %>% select(padj) %>% deframe() %>% round(digits = 3)
NES = DF20.results.tests[[test]] %>% filter(pathway == geneset) %>% select(NES) %>% deframe() %>% round(digits = 3)
plots[[i]] = plotEnrichment(hallmark.list[[geneset]], DF20.ranks) + labs(title = glue("{geneset} GSEA Results For DF20 MRD vs. vehicle ({test}: NES = {NES}, padj = {padj})")) + theme(plot.title = element_text(size = 8))
ggsave(plots[[i]], filename = glue("DF20_{test}_MRDVSvehicle_{geneset}_GSEA.png"), path = "jesslyn_plots/PDX_test/GSEA_paired/test")
i = i + 1
}
}
plots[[1]] + plots[[2]] + plots[[3]] + plots[[4]] + plots[[5]] + plots[[6]] + plots[[7]] + plots[[8]]+ plot_layout(nrow = 4, ncol = 2)
QUESTION: How does the slot we use for FindMarkers affect our Volcano Plot?
IMPORTANCE: Volcano plots help us detect individual DE genes via logFC and padj cutoffs. Although different data slots used for FindMarkers give us similar p values, they yielded different gene ranks. Furthermore, while the “data” slot report DE genes in avg_logFC, the “scale.data” slot reports values in avg_diff. We are interested in seeing whether we can detect different / more DE genes depending on the metric we use.
HYPOTHESIS: We should be able to detect different DE genes depending on the data slot we use. However we’re not sure which one would be more accurate.
# Volcano plots for DF20 MRD vs. vehicle, run once per expression slot.
# NOTE(review): DEAnalysis_code is a project-local helper not visible here.
# From its usage, calling it without `markers` appears to run the DE test,
# and calling it with `markers` appears to build the volcano plot -- confirm
# against its definition.
# 2 slots x (1 all-genes plot + 2 geneset plots) = 6 plots
plots <- vector("list", length = 6)
i <- 1
for (slt in slots){
#find all DE genes between MRD and vehicle for DF20
DEgenes <- DEAnalysis_code(DF20_score, group.by= "treatment.status", group.1 = "MRD", group.2 = "vehicle", slot = slt)
#volcano plot of all genes
plots[[i]] <- DEAnalysis_code(DF20_score, markers = DEgenes, group.by = "treatment.status", group.1 = "MRD", group.2 = "vehicle", graph = TRUE, slot = slt) + labs(title = glue("DF20 MRD vs. vehicle All DE Genes ({slt})")) + theme(plot.title = element_text(size = 8))
ggsave(plots[[i]], filename = glue("DF20_MRDvs.vehicle_allDEGenes({slt}).png"), path= glue( "jesslyn_plots/PDX_test/Volcano/test"), width = 15)
i = i + 1
#volcano plots for specific genesets
# NOTE(review): these geneset calls omit graph = TRUE while the all-genes
# call sets it -- verify DEAnalysis_code still returns a plot in this case.
for(geneset in genesetsOI){
plots[[i]] <- DEAnalysis_code(DF20_score, markers = DEgenes, group.by = "treatment.status", group.1 = "MRD", group.2 = "vehicle", geneset = hallmark.list[[geneset]], slot = slt) + labs(title = glue("DF20 MRD vs. vehicle {geneset} DE Genes ({slt})")) + theme(plot.title = element_text(size = 8))
ggsave(plots[[i]], filename = glue("DF20_MRDvs.vehicle_{geneset}DEGenes({slt}).png"), path= glue("jesslyn_plots/PDX_test/Volcano/test"), width = 15)
i = i + 1
}
}
plots[[1]] + plots[[2]] + plots[[3]] + plots[[4]] + plots[[5]] + plots[[6]] + plot_layout(nrow = 2, ncol = 3)
QUESTION: How does the statistical test we use for FindMarkers affect our Volcano Plot?
IMPORTANCE: Volcano plots help us detect individual DE genes via logFC and padj cutoffs. Since different statistical tests yielded different logFC and padj values as demonstrated above in section #3.1.1, we are interested in visualizing whether a particular statistical test detects different / more DE genes than others.
HYPOTHESIS: We should be able to detect different DE genes depending on the statistical test we use. However we’re not sure which one would be more accurate.
# Volcano plots for DF20 MRD vs. vehicle, run once per statistical test.
# 4 tests x (1 all-genes plot + 2 geneset plots) = 12 plots
plots <- vector("list", length = 12)
i <- 1
for (test in tests){
#find all DE genes between MRD and vehicle for DF20
DEgenes <- DEAnalysis_code(DF20_score, group.by= "treatment.status", group.1 = "MRD", group.2 = "vehicle", stattest = test)
#volcano plot of all genes
plots[[i]]<- DEAnalysis_code(DF20_score, markers = DEgenes, group.by = "treatment.status", group.1 = "MRD", group.2 = "vehicle", graph = TRUE, stattest = test) + labs(title = glue("DF20 MRD vs. vehicle All DE Genes ({test})")) + theme(plot.title = element_text(size = 8))
ggsave(plots[[i]], filename = glue("DF20_MRDvs.vehicle_allDEGenes({test}).png"), path= glue( "jesslyn_plots/PDX_test/Volcano/test"), width = 15)
i = i + 1
#volcano plots for specific genesets
# NOTE(review): these geneset calls omit graph = TRUE while the all-genes
# call sets it -- verify DEAnalysis_code still returns a plot in this case.
for(geneset in genesetsOI){
plots[[i]] <- DEAnalysis_code(DF20_score, markers = DEgenes, group.by = "treatment.status", group.1 = "MRD", group.2 = "vehicle", geneset = hallmark.list[[geneset]], stattest = test) + labs(title = glue("DF20 MRD vs. vehicle {geneset} DE Genes ({test})")) + theme(plot.title = element_text(size = 8))
ggsave(plots[[i]], filename = glue("DF20_MRDvs.vehicle_{geneset}DEGenes({test}).png"), path= glue("jesslyn_plots/PDX_test/Volcano/test"), width = 15)
i = i + 1
}
}
plots[[1]] + plots[[2]] + plots[[3]] + plots[[4]] + plots[[5]] + plots[[6]] + plots[[7]] + plots[[8]]+ plots[[9]] + plots[[10]] + plots[[11]] + plots[[12]] + plot_layout(nrow = 4, ncol = 3)